In [1]:
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt

TO DO:

  • normalize the time series before taking the correlation
  • ✓ use normalized correlation measure
  • ✓ do a partitioning based on similarity
  • ✓ sort adjacency matrix based on similarity
  • ✓ visualize the partition
  • ✓ stackplot of activity broken down by partitition
  • use a better correlation metric--do the math correctly
    • ✓ take the log of activity before correlating
  • move most of this to the libraries
  • order groups reliably by time
    • in data
    • in plot
  • summarize groups
  • entity resolution on participants
  • convolve the time series with a declining 'response time'

In [2]:
%matplotlib inline

In [3]:
from bigbang.archive import Archive

arx = Archive("scipy-user",archive_dir="../archives")

Get the activity of a list


In [4]:
act = arx.get_activity()

Since are going to be computing correlations between N different time series data sets and that's an $O(N^2)$ operation, let's limit N.


In [5]:
cutoff = 20

def filtered_participants(cutoff):
    xc = act.sum() > cutoff
    return act.columns[xc]

filtered_participants(cutoff)[:10]


Out[5]:
Index([u'Chris.Barker@noaa.... (Christopher Barker)', u'Dharhas.Pothina@twdb.state.tx... (Dharhas Pothina)', u'Fernando.Perez at colorado.edu (Fernando Perez)', u'Giovanni.Samaey at cs.kuleuven.ac.be (Giovanni Samaey)', u'JRadinger@gmx... (Johannes Radinger)', u'Jerome.Kieffer@esrf... (Jerome Kieffer)', u'Karl.Young@ucsf.... (Karl Young)', u'L.Ulferts@hs-osnabrue... (L.Ulferts@hs-osnabrue...)', u'Pierre.RAYBAUT@C... (Pierre.RAYBAUT@C...)', u'R.Springuel@umit.maine.... (R. Padraic Springuel)'], dtype='object')

In [6]:
from scipy.stats.stats import pearsonr  

fc = filtered_participants(cutoff)

n = len(fc)

pc = np.zeros([n,n])

for i in range(0,n):
    for j in range(i,n):
        # since pearson correlation assumes normally distributed data
        # and we believe activity to be log-normally distributed (see Shalizi...)
        # we'll take the log of values here
        si = np.log1p(act[fc[i]])
        sj = np.log1p(act[fc[j]])

        c = pearsonr(si,sj)[0]
        pc[i,j] = c
        pc[j,i] = c
        
pc


Out[6]:
array([[ 1.        ,  0.11676837, -0.01500432, ..., -0.00551488,
        -0.00552229, -0.00711201],
       [ 0.11676837,  1.        , -0.01628325, ..., -0.00598495,
        -0.005993  , -0.00771822],
       [-0.01500432, -0.01628325,  1.        , ..., -0.00697589,
        -0.00698527, -0.00899615],
       ..., 
       [-0.00551488, -0.00598495, -0.00697589, ...,  1.        ,
        -0.00256745, -0.00330656],
       [-0.00552229, -0.005993  , -0.00698527, ..., -0.00256745,
         1.        ,  0.02930758],
       [-0.00711201, -0.00771822, -0.00899615, ..., -0.00330656,
         0.02930758,  1.        ]])

In [7]:
G = nx.Graph(pc)
labels = dict(enumerate(fc))
G = nx.relabel_nodes(G,labels)

In [8]:
plt.imshow(pc)


Out[8]:
<matplotlib.image.AxesImage at 0x7fb1e88d9c50>

In [9]:
# from http://sociograph.blogspot.com/2012/11/visualizing-adjacency-matrices-in-python.html

import networkx as nx
from matplotlib import pyplot, patches

def draw_adjacency_matrix(G, node_order=None, partitions=[], colors=[],cmap="Greys"):
    """
    - G is a networkx graph
    - node_order (optional) is a list of nodes, where each node in G
          appears exactly once
    - partitions is a list of node lists, where each node in G appears
          in exactly one node list
    - colors is a list of strings indicating what color each
          partition should be
    If partitions is specified, the same number of colors needs to be
    specified.
    """
    adjacency_matrix = nx.to_numpy_matrix(G, dtype=np.bool, nodelist=node_order)

    #Plot adjacency matrix in toned-down black and white
    fig = pyplot.figure(figsize=(5, 5)) # in inches
    pyplot.imshow(adjacency_matrix,
                  cmap=cmap,
                  interpolation="none")
    
    # The rest is just if you have sorted nodes by a partition and want to
    # highlight the module boundaries
    assert len(partitions) == len(colors)
    ax = pyplot.gca()
    current_idx = 0
    for partition, color in zip(partitions, colors):
        #for module in partition:
        ax.add_patch(patches.Rectangle((current_idx, current_idx),
                                      len(partition), # Width
                                      len(partition), # Height
                                      facecolor="none",
                                      edgecolor=color,
                                      linewidth="1"))
        current_idx += len(partition)

In [10]:
# order nodes by number of messages sent by each participant
o = list(act[fc].sum().order(ascending=False).keys())
draw_adjacency_matrix(G,node_order=o)



In [11]:
from sklearn import cluster

n_clusters = 5
sc = cluster.SpectralClustering(n_clusters=n_clusters)
partition = sc.fit_predict(pc)
partition_order = [fc[x] for x in np.argsort(partition)]


/home/sb/anaconda/envs/bigbang/lib/python2.7/site-packages/sklearn/cluster/spectral.py:421: UserWarning: The spectral clustering API has changed. ``fit``now constructs an affinity matrix from data. To use a custom affinity matrix, set ``affinity=precomputed``.
  warnings.warn("The spectral clustering API has changed. ``fit``"

In [12]:
np.sort(partition)


Out[12]:
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4], dtype=int32)

In [13]:
from collections import defaultdict

parts = defaultdict(list)
for i in range(len(partition)):
    parts[partition[i]].append(fc[i])
    
colors = [plt.cm.hsv(1. / (k + 1)) for k in parts.keys()]

In [14]:
colors


Out[14]:
[(1.0, 0.0, 0.09375, 1.0),
 (0.0, 1.0, 0.96470316317617644, 1.0),
 (0.03124934374934376, 1.0, 1.3125013124790507e-06, 1.0),
 (0.51764557646910592, 1.0, 0.0, 1.0),
 (0.81874881874881877, 1.0, 0.0, 1.0)]

In [15]:
len(parts.values())


Out[15]:
5

In [16]:
draw_adjacency_matrix(G,
                      node_order=partition_order,
                      partitions=parts.values(),
                      colors=colors)



In [17]:
for k,v in parts.items():
    print str(k) + ": " + str(len(v))


0: 36
1: 123
2: 70
3: 19
4: 32

In [18]:
node_parts = {}

for k,v in parts.items():
    for n in v:
        node_parts[n] = int(k)

In [19]:
import networkx as nx

#FG = nx.Graph(pc > .01)
FG = nx.from_numpy_matrix(pc * (pc > 0))

labels = dict(enumerate(fc))
FG = nx.relabel_nodes(FG,labels)
nx.set_node_attributes(FG,'corr-cluster',node_parts)


pos = nx.spring_layout(FG, k = 0.6, iterations = 1000)
nx.draw(FG,pos)
nx.write_gexf(FG,'corr.gexf')



In [21]:
from bigbang import plot

plot.stack(act,partition=parts.values(),smooth=7)